// SPDX-License-Identifier: GPL-2.0 or GPL-3.0
// Copyright © 2019 Ariadne Devos
/* sHT -- find lexeme boundaries */

#include <sHT/bitvec.h>
#include <sHT/index.h>
#include <sHT/lex.h>
#include <sHT/string.h>
#include <sHT/taint.h>
#include <sHT/test.h>

size_t
sHT_lex(struct sHT_lex_buf *to, const unsigned char from[], size_t n, const struct sHT_lex_type *c)
{
	/* The current offset into @var{from}. (Set later). */
	size_t i;
	/* The old number of accumulated bytes,
	   therefore, the index into @var{to->bytes} to start writing to. */
	size_t offset = to->offset;
	/* The following loop: iterate over the bytes of @var{from}, to
	  validate their syntax and copy them -- and process a fragment when
	  complete. However, not all bytes, as only @code{c->max_known} are
	  allocated.

	  @var{offset}: data from previous runs is remembered. */
	/* Underflow 1: `sHT_lex_buf` invariant (`offset` < `max_known`) */
	size_t todo = sHT_min_size(n, c->max_known - offset);
	/* Induct over byte locations, until a space character, a syntax error
	  or the method is found to be too long to be known. */
	/* (1) `n` > 0 (`sHT_lex` precondition)
	  (1) => (2) 0 < `n`
	  (3) `offset` < `max_known` (`sHT_lex_buf` invariant)
	  (3) => (4) 0 < `max_known` - `offset`
	  (2, 4) => (5) 0 < min(`n`, `max_known` - `offset`)
	  (5): QED `sHT_index_iterate` positivity (TODO: do ... while loop) */

	/* Invariant: byte `offset` to `offset` + `i` (exclusive) of
	  @code{buf->bytes} are set. Base case: trivial. */
	sHT_index_iterate(i, todo) {
		/* If zero @var{n} were allowed, this would be out of bounds */
		/* (1) i < todo (@var{sHT_index_iterate})
		  (2) todo <= n (@var{sHT_min_size})
		  (1, 2) => i < n
		  QED @var{from} length */
		uint8_t b = from[i];
		/* (1) i < todo (@var{sHT_index_iterate})
		  (2) todo <= max_known - offset (@var{sHT_min_size})
		  (1, 2) => (3) i < max_known - offset
		  (3) => offset + i < max_known
		  QED @var{to} capacity.

		  QED induction step (is set). */
		to->bytes[offset + i] = b;
		if (sHT_bit_test(c->c_allow, b)) {
			/* Correct byte, but not a terminator.
			  Continue the search. */
			continue;
		}
		/* Non-speculatively, @var{b} is not one of the allowed
		  bytes. Either it is the terminator, or a syntax error.
		  Which one? (0: syntax error, 1: terminator)*/
		int which = sHT_eq_bool(c->c_stop, b);
		/* Not used anymore; taint for analysis */
		sHT_taint(&to->offset);
		/* +1: also count the terminating byte
		  (<tests/lex.c> found this bug)

		  (1) i < todo (@var{sHT_index_iterate}),
		  (2) todo <= n (@var{sHT_index_iterate})
		  (1, 2) => (3) i < n.
		  (3) => (4) i + 1 <= n
		  QED bounds last argument

		  (1) i < todo (@var{sHT_index_iterate}),
		  (2) todo <= max_known - offset (@var{sHT_min_size})
		  (1, 2) => (3) i <= max_known - offset
		  (3) => (4) offset + i <= max_known

		  QED length/index bounds */
		return c->cb_value[which](to, offset + i, i + 1);
	}

	/* Compare the number of running total of tested bytes with the
	  maximal known lexeme length. If it the former begins to equal
	  the latter, there is no point in copying anymore, but the
	  syntax must still be validated. */
	/* Overflow:

	  (1) i < todo (@var{sHT_index_iterate}),
	  (2) todo <= max_known - offset (@var{sHT_min_size})
	  (1, 2) => (3) i <= max_known - offset
	  (3) => (4) offset + i <= max_known
	  (5) `max_known` <= `SIZE_MAX` (typing)
	  (3, 5) => `offset` + `i` <= `SIZE_MAX`

	  QED no overflow */
	if (sHT_ge(offset + i, c->max_known)) {
		/* Not used anymore; taint for analysis */
		sHT_taint(&to->offset);
		sHT_taint(&to->bytes[0]);
		return c->cb_ignore(to, i);
	}

	/* More bytes must be read before the lexeme is complete.
	  Proof of progress (i = n) (non-speculatively):

	  (1) offset + i < max_known (@var{sHT_ge})
	  (2) i = todo (@var{sHT_index_iterate})
	  (1) => (4) i < max_known - offset
	  (2, 4) => (5) todo < max_known - offset
	  (5) => todo = n (@var{sHT_min_size})
	  (2, 5) => i = n

	  QED progress */
	/* Remember the number of copied bytes */
 	/* Overflow/bounds:

	  (1) i <= todo (@var{sHT_index_iterate}),
	  (2) todo <= max_known - offset (@var{sHT_min_size})
	  (1, 2) => (3) i <= max_known - offset
	  (3) => (4) offset + i <= max_known
	  QED bounds; continue overflow

	  (5) max_known < UINT16_MAX (@var{uint16_t})
	  (4, 5) => offset + i < UINT16_MAX

	  QED overflow */
	to->offset += i;
	/* Bounds:

	  (1) i <= todo (@var{sHT_index_iterate})
	  (2) todo <= n (@var{sHT_min_size})
	  (1, 2) => i <= n

	  QED bounds */
	return i;
}
